# Programming Elegant DataVis with the tidyverse and ggplot2 R packages
# Packages that are attached with library() for the rest of the script.
packages <- c(
  "tidyverse", "ggplot2", "dplyr", "patchwork",
  "gganimate", "plotly", "treemap", "d3Tree", "ggstatsplot"
)

# Packages that the script only calls through `pkg::fun()`
# (ggdist::stat_halfeye, knitr::kable). They must be installed but are
# deliberately not attached, so the search path stays as it was.
namespace_only <- c("ggdist", "knitr")

# Install whatever is missing. requireNamespace() is used for the check
# because it only reports availability; require() would also attach the
# package as a side effect.
for (p in c(packages, namespace_only)) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
}

# Attach in the declared order; library() errors loudly if a package is
# still unavailable after the install attempt.
for (p in packages) {
  library(p, character.only = TRUE)
}
# Load the raw CSV inputs; paths are relative to the working directory.
jobs <- read_csv("data/Jobs.csv")
emp <- read_csv("data/Employers.csv")
travel <- read_csv("data/TravelJournal.csv")
# The path has to be a single-line string: readr interprets a string that
# contains a newline as literal CSV data instead of a file to open.
apartments <- read_csv("data/wkt/Apartments.csv")
# Count the rows of `jobs` for each employerId, ordered from the largest
# count to the smallest. The count column is named `n`.
hires <- jobs %>%
  count(employerId) %>%
  arrange(desc(n))
# employerpay <- jobs %>%
# group_by(employerId) %>%
# dplyr::summarise(emppay = weeklypay)
#
# pay_hires <- merge(x = hires, y = employerpay, by = "employerId", all = TRUE) %>%
# mutate(employeepay = emppay / n) %>%
# arrange(desc(employeepay))
# Per-employer summary used as the treemap input:
#   jobNum               - number of rows in `jobs` for the employer
#   totalPay             - sum of hourlyRate over those rows
#   `Average Hourly Pay` - mean of hourlyRate over those rows
#   group                - label "<jobNum> Employees" used as the outer
#                          treemap level
jobsnum <- jobs %>%
  group_by(employerId) %>%
  summarise(
    jobNum = n(),
    totalPay = sum(hourlyRate),
    `Average Hourly Pay` = mean(hourlyRate)
  ) %>%
  mutate(group = paste(jobNum, "Employees"))
# Static treemap: tiles are nested by `group` then `employerId`, sized by
# `totalPay` and coloured by the value of `Average Hourly Pay`.
tm <- treemap(
  jobsnum,
  index = c("group", "employerId"),
  vSize = "totalPay",
  vColor = "Average Hourly Pay",
  type = "value",
  title = "Employee Wage by Workplace"
)

# Earlier variants of the interactive call, kept for reference:
# d3Tree(tm, rootname = 'Employee Hourly Wage by Workplace')
# d3tree(tm, rootname = 'Employee Hourly Wage by Workplace')

# Interactive version of the treemap object built above.
# NOTE(review): passing a treemap() result to d3tree() looks like the
# d3treeR API, while the package list attaches 'd3Tree' - confirm which
# package is meant to provide this function.
d3tree(tm)
# Derive the weekly-pay columns and order the education levels.
#   workinghours         - (endTime - startTime) in hours, multiplied by 5
#                          (presumably 5 working days per week - confirm);
#                          kept as a difftime, as before
#   weeklypay            - hourlyRate * workinghours as a plain numeric
#   educationRequirement - factor with an explicit low-to-high level order,
#                          which drives axis/table ordering downstream
# Columns are referenced through the data mask rather than `jobs$...`, and
# `levels` is spelled out instead of relying on partial argument matching.
jobs <- jobs %>%
  mutate(
    workinghours = difftime(endTime, startTime, units = "hours") * 5,
    weeklypay = as.numeric(hourlyRate * workinghours),
    educationRequirement = factor(
      educationRequirement,
      levels = c("Low", "HighSchoolOrCollege", "Bachelors", "Graduate")
    )
  )
# Weekly pay summarised per education requirement:
#   n    - number of jobs in the group
#   mean - mean weekly pay
#   sd   - sample standard deviation of weekly pay
#   se   - standard error of the mean
weeklypay_education <- jobs %>%
  group_by(educationRequirement) %>%
  summarise(
    n = n(),
    mean = mean(weeklypay),
    sd = sd(weeklypay)
  ) %>%
  # sd() already applies the n - 1 (Bessel) correction, so the standard
  # error of the mean is sd / sqrt(n). Dividing by sqrt(n - 1) corrected
  # twice and overstated the error. Output rendered before this change
  # will show slightly larger `se` values.
  mutate(se = sd / sqrt(n))

# Render the summary as an HTML table.
knitr::kable(head(weeklypay_education), format = "html")
| educationRequirement | n | mean | sd | se |
|---|---|---|---|---|
| Low | 119 | 490.8497 | 160.4249 | 14.768306 |
| HighSchoolOrCollege | 705 | 586.9923 | 247.5990 | 9.331737 |
| Bachelors | 330 | 934.3181 | 521.0650 | 28.727241 |
| Graduate | 174 | 1355.1308 | 676.4057 | 51.426170 |
# Mean weekly pay per education requirement (red points) with error bars
# spanning mean - se to mean + se.
ggplot(weeklypay_education) +
  geom_errorbar(
    mapping = aes(
      x = educationRequirement,
      ymin = mean - se,
      ymax = mean + se
    ),
    width = 0.2,
    colour = "black",
    alpha = 0.9,
    size = 0.5
  ) +
  geom_point(
    mapping = aes(x = educationRequirement, y = mean),
    stat = "identity",
    color = "red",
    size = 1.5,
    alpha = 1
  ) +
  labs(title = "Fg.1-2 Weekly pay vs educational requirement") +
  # Centre the title above the panel.
  theme(plot.title = element_text(hjust = 0.5))
# Raincloud-style plot of hourlyRate per education requirement: a half-eye
# density slab, a narrow box plot without outlier markers, and the jittered
# raw points.
p <- ggplot(
  jobs,
  aes(
    x = educationRequirement,
    y = hourlyRate,
    fill = educationRequirement
  )
) +
  # Density slab shifted to the side of the box plot; the interval and the
  # point summary are switched off (.width = 0, point_colour = NA).
  ggdist::stat_halfeye(
    adjust = .5,
    width = .6,
    .width = 0,
    justification = -.3,
    point_colour = NA
  ) +
  geom_boxplot(width = .25, outlier.shape = NA) +
  # Fixed seed keeps the jitter reproducible between renders.
  geom_point(
    size = 1.3,
    alpha = .3,
    position = position_jitter(seed = 1, width = .1)
  ) +
  coord_cartesian(xlim = c(1.2, NA), clip = "off") +
  labs(
    title = "Wage Distribution for Different Education Level",
    subtitle = "High Wages For Higher Educated"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(
      size = 12, hjust = 0.5, color = "mediumvioletred"
    ),
    axis.title.y = element_text(angle = 0),
    axis.ticks.x = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(color = "grey")
  )

# Convert the ggplot object to an interactive plotly widget.
ggplotly(p)
# Distinct (participantId, travelEndLocationId) pairs among the
# "Work/Home Commute" journal entries. The per-pair count is computed only
# to collapse duplicates and is dropped again.
work_home <- travel %>%
  filter(purpose == "Work/Home Commute") %>%
  group_by(participantId, travelEndLocationId) %>%
  tally() %>%
  select(-n)

# Keep only the destinations that match an employerId in `emp`, then count
# the matching destinations per participant.
work <- work_home %>%
  inner_join(emp, by = c("travelEndLocationId" = "employerId")) %>%
  select(participantId, travelEndLocationId) %>%
  group_by(participantId) %>%
  tally(name = "numberofplacesworked")

# Participants with more than one matched destination, highest count first.
workinmoreplaces <- work %>%
  filter(numberofplacesworked > 1) %>%
  arrange(desc(numberofplacesworked))
# Histogram of the number of matched workplaces per participant, with a
# one-sample test against a reference value of 1.
# The trailing comma after the last argument was removed: it passed an
# empty argument into the call and only worked by accident of argument
# matching.
gghistostats(
  data = work,
  x = numberofplacesworked,
  xlab = "numbers of places worked",
  title = "Distribution of turnover rate",
  test.value = 1
)